{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.feature_extraction.text import TfidfTransformer # TF-IDF\n",
    "from sklearn.model_selection import train_test_split         # 切分数据集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "pi = pd.read_csv('data\\个人信息.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.0    6973\n",
       "1.0    2754\n",
       "2.0    1234\n",
       "Name: sex, dtype: int64"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pi.fillna(0., inplace=True)    # 0和None值表示未知，故将None值填充为0，全部用0表示\n",
    "pi.sex.value_counts()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "性别：性别为1表示男性，2表示女性，0表示未知。  \n",
    "有6973个缺失值，考虑不同性别主播在用户昵称上有较大的区别（女性多用可爱字眼如\"鸭、哇、妹妹\"等），决定通过用户昵称来对性别进行分类，用于缺失值的填充。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "> 假设特征之间是独立的"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**第一步：分词**  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['时 代 - 十 四 剑 姬', 'D a e - 北 枫 c', '时 代 - 老 枪 赵 信 【 李 弟 】']"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 去除停用词、每个字为一个特征，处理后的昵称都存放在datalist\n",
    "dataList = pi[pi.sex!=0].昵称.map(list).map(lambda x: ' '.join(x)).to_list()\n",
    "classList = pi[pi.sex!=0].sex     # 类别\n",
    "dataList[:3]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**第二步：划分数据集**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train, X_test, Y_train, Y_test = train_test_split(dataList, classList, test_size=300, random_state=123)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**第三步：转为词频矩阵**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "# 去除停用词\n",
    "stop_words = list('!\"#$%&\\'·丨『』☆()（）*+,-./:：;<=>?@，。?★、…【】《》？“”‘！[\\\\]^_`{|}~') + [' ', '\\x83', '®', '°', '͡', 'ζ', '۞', '۩', 'އ', 'ޓ', 'ั', '๑', '๛', '\\u200d', '—', '•', '₀', '₁', '₅', '₆', '₈', 'ₑ', '√', '⌒', '◆', '◇', '☀', '☁', '☂', '☜', '☞', '♂', '♏', '♔', '♡', '♬', '✿', '❀', '❤', '〆']\n",
    "count_vect = CountVectorizer(stop_words=stop_words, analyzer=\"char\", lowercase=False)\n",
    "X_train_counts = count_vect.fit_transform(X_train)  # 拟合模型，并返回文本矩阵\n",
    "# 将文本中的词语转换为词频矩阵。即各个词语出现的次数，通过get_feature_names()可看到所有文本的关键字，通过toarray()可看到词频矩阵的结果。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[3, 0, 0, ..., 0, 0, 0],\n",
       "       [1, 0, 0, ..., 0, 0, 0],\n",
       "       [2, 0, 0, ..., 0, 0, 0],\n",
       "       ...,\n",
       "       [6, 0, 0, ..., 0, 0, 0],\n",
       "       [5, 0, 0, ..., 0, 0, 0],\n",
       "       [2, 0, 0, ..., 0, 0, 0]], dtype=int64)"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_train_counts.toarray()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "矩阵元素a[i][j] 表示j词在第i个文本下的词频。其中第1列的文本是' '，侧面反映了文本长度。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(3688, 2160)"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_train_counts.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "共有3998个昵称，词典中共有2106个单词/中文。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1 3 1 1 1]\n",
      "[0 5]\n",
      "[1332    0 1011 1405  682]\n",
      "爱\n"
     ]
    }
   ],
   "source": [
    "# 查看第一个昵称的相关信息\n",
    "A = X_train_counts[0]\n",
    "print(A.data)     # 各个单词出现的次数\n",
    "print(A.indptr)   # 一共有多少单词\n",
    "print(A.indices)  # 各个单词的编号\n",
    "print(count_vect.get_feature_names()[A.indices[0]]) # 打印某一个序号所对应的单词"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**第四步：朴素贝叶斯分类**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.naive_bayes import MultinomialNB\n",
    "clf = MultinomialNB().fit(X_train_counts, Y_train)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "测试集分类效果"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.73"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_test_counts = count_vect.transform(X_test)   # 转为词向量\n",
    "predicted = clf.predict(X_test_counts)\n",
    "np.mean(predicted==Y_test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "应用到未知性别的数据（pre_data）分类中。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "pre_data = pi[pi.sex==0].昵称.map(list).map(lambda x: ' '.join(x)).to_list()\n",
    "pre_data = count_vect.transform(pre_data)     # 转为词频矩阵\n",
    "\n",
    "pre = clf.predict(pre_data)\n",
    "pi.loc[pi.sex==0, 'sex'] = pre"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**第五步：保存到`用户id_性别字典.txt`中**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "sex_dict = pi.set_index('用户id').sex.to_dict()\n",
    "with open('data\\用户id_性别字典.txt', 'w') as f:\n",
    "    f.write(str(sex_dict))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.7",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
